.libPaths(c("/hpc/packages/minerva-centos7/rpackages/4.2.0/site-library", "/hpc/packages/minerva-centos7/rpackages/bioconductor/3.15", .libPaths()))
library(sctransform, lib.loc = "/hpc/users/richaa21/.Rlib")
library(Seurat, lib.loc = "/hpc/users/richaa21/.Rlib")
## Loading required package: SeuratObject
## Loading required package: sp
## 'SeuratObject' was built under R 4.2.0 but the current version is
## 4.3.0; it is recomended that you reinstall 'SeuratObject' as the ABI
## for R may have changed
## 'SeuratObject' was built with package 'Matrix' 1.6.4 but the current
## version is 1.6.5; it is recomended that you reinstall 'SeuratObject' as
## the ABI for 'Matrix' may have changed
##
## Attaching package: 'SeuratObject'
## The following object is masked from 'package:base':
##
## intersect
library(patchwork)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
pbmc.data <- Read10X(data.dir = "/sc/arion/projects/ad-omics/ashley/data/aF/outs/per_sample_outs/aF/count/sample_filtered_feature_bc_matrix")
pbmc.f <- CreateSeuratObject(counts = pbmc.data, project = "fibril", min.cells = 3, min.features = 200)
pbmc.f
## An object of class Seurat
## 23791 features across 18327 samples within 1 assay
## Active assay: RNA (23791 features, 0 variable features)
## 1 layer present: counts
Lets check how many cells and features we are starting with.
length(colnames(pbmc.f)) #number of cells
## [1] 18327
length(rownames(pbmc.f)) #number of features
## [1] 23791
a. QC mitochondiral genes. The PercentFeatureSet() calculates the % of counts originating from a set of features - here we are first looking at mitochondiral features, which are genes starting with MT.
b. By using [[ ]], I am adding columns to the pbmc matrix to store this QC data.
pbmc.f[["percent.mt"]] <- PercentageFeatureSet(pbmc.f, pattern = "^MT-")
# Show QC metrics for the first 5 cells
head(pbmc.f@meta.data, 5)
## orig.ident nCount_RNA nFeature_RNA percent.mt
## AAACCTGAGAAACCAT-1 fibril 7898 2915 4.304887
## AAACCTGAGAAGGACA-1 fibril 10204 3359 5.556644
## AAACCTGAGAATCTCC-1 fibril 15411 4590 2.602038
## AAACCTGAGAATGTGT-1 fibril 8500 2962 4.141176
## AAACCTGAGACGCACA-1 fibril 30572 6417 3.185922
library(farver, lib.loc = "/hpc/packages/minerva-centos7/rpackages/4.2.0/site-library")
VlnPlot(pbmc.f, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)
plot1 <- FeatureScatter(pbmc.f, feature1 = "nCount_RNA", feature2 = "percent.mt")
plot2 <- FeatureScatter(pbmc.f, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
plot1
plot2
# I will select for %mt under 10% and features greater than 200 to capture alive healthy cells.
pbmc.f <- subset(pbmc.f, subset = nFeature_RNA > 200 & percent.mt < 10)
head(pbmc.f@meta.data)
## orig.ident nCount_RNA nFeature_RNA percent.mt
## AAACCTGAGAAACCAT-1 fibril 7898 2915 4.304887
## AAACCTGAGAAGGACA-1 fibril 10204 3359 5.556644
## AAACCTGAGAATCTCC-1 fibril 15411 4590 2.602038
## AAACCTGAGAATGTGT-1 fibril 8500 2962 4.141176
## AAACCTGAGACGCACA-1 fibril 30572 6417 3.185922
## AAACCTGAGCTGAAAT-1 fibril 7030 2652 3.911807
The data is normalized based on the feature (number of genes in a cell) by the total expression. This number is multiplied by 10,000 and then log transformed. The function to do this is “NormalizeData.” The values specied below are the default values of this function.
pbmc.f <- NormalizeData(pbmc.f, normalization.method = "LogNormalize", scale.factor = 10000)
## Normalizing layer: counts
pbmc.f <- FindVariableFeatures(pbmc.f, selection.method = "vst", nfeatures = 2000)
## Finding variable features for layer counts
# Identify the 10 most highly variable genes
top10 <- head(VariableFeatures(pbmc.f), 10)
# plot variable features with and without labels
plot1 <- VariableFeaturePlot(pbmc.f)
plot2 <- LabelPoints(plot = plot1, points = top10, repel = TRUE, xnudge = 0, ynudge = 0)
plot1
plot2
## Scaling the Data The data is scaled by doing a linear transformation.
The ScaleData function does this by shifting the expression of genes so
that the mean expression becomes 0 and the variance is 1. By default,
only variable genes are scaled. This is changed by features =
all.genes
##Scaling RNA data, we only scale the variable features here for efficiency
all.genes <- rownames(pbmc.f)
pbmc.f <- ScaleData(pbmc.f, vars.to.regress = c("percent.mt"))
## Regressing out percent.mt
## Centering and scaling data matrix
For the first principal components, RunPCA shows the most positive (correlated) and most negative (anticorrelated) genes
pbmc.f <- RunPCA(pbmc.f, features = VariableFeatures(object = pbmc.f))
## PC_ 1
## Positive: LTB, TSC22D3, IL7R, FP236383.3, TRIM22, TCF7, AQP3, IFITM1, TNFSF8, LEF1
## PIM2, CCR7, JAML, C1orf162, LINC00861, CORO1B, NEAT1, MAL, MYC, CD5
## NCDN, BEX3, TIMP1, CD4, SESN3, S100A6, BBC3, HERPUD1, AC139720.1, CYSLTR1
## Negative: RRM2, TYMS, MKI67, TUBA1B, STMN1, PCLAF, CDK1, UBE2C, TOP2A, KIFC1
## NUSAP1, ASF1B, ZWINT, TUBB, CCNA2, ASPM, CENPF, GTSE1, PKMYT1, HIST1H1B
## KNL1, CDCA8, CDKN3, SPC25, GGH, TPX2, CKAP2L, BIRC5, CENPU, CDCA2
## PC_ 2
## Positive: CTSW, PRF1, GZMB, CCL4, NKG7, CCL3, KLRD1, TYROBP, GZMA, KLRC1
## FCER1G, AOAH, CST7, IL2RB, CCL5, SRGN, TRDC, GNLY, HOPX, KLRK1
## FCGR3A, GZMK, NCAM1, GSTP1, KIR2DL4, LYST, EOMES, KLRC2, CD38, SH2D1B
## Negative: IL7R, TIMP1, AQP3, S100A6, LIME1, COTL1, S100A10, S100A4, CD5, TSC22D3
## SNED1, IFITM1, GPR183, WDR86-AS1, CD4, ANP32B, CORO1B, LEF1, ITGB1, LMNA
## MYC, CRIP1, STAT1, PLP2, PRDX1, SOS1, INPP4B, RGCC, UBXN11, IL9R
## PC_ 3
## Positive: PLK1, CLIC3, PLEK, TYROBP, FCER1G, CEBPD, FGFBP2, CENPA, CX3CR1, CCNB1
## DLGAP5, FCGR3A, ASPM, KLRD1, KIF20A, PSRC1, FGR, HMMR, CCNB2, CDKN2D
## CD300A, PRF1, GZMK, SH2D1B, KIF14, CENPF, PIMREG, TROAP, NEK2, TRDC
## Negative: GINS2, HSP90AB1, MCM2, MCM6, CSF2, MCM5, TRBV6-2, PLPP1, PAICS, CDC6
## TRAV12-2, RANBP1, MCM7, UNG, MCM3, MCM4, HSPE1, NME1, MIF, CDC45
## SRM, HSPD1, DCTPP1, E2F1, CD82, TCTN3, IL2RA, TPI1, PHLDA1, DUSP4
## PC_ 4
## Positive: TRBV6-2, CD27, TRAV12-2, LRRN3, PLPP1, CD8A, PECAM1, CD8B, MYO1E, SNX9
## SIRPG, REG4, RASGRF2, BCL2L11, CBLB, BACH2, CSF2, LAIR2, AL136962.1, PDE3B
## APBB2, BHLHE40-AS1, CRTAM, LYST, DUSP2, TRBV5-4, CCR7, GNG8, ITGA1, TCF7
## Negative: HLA-DRB1, HLA-DPA1, S100A4, LGALS1, S100A6, S100A10, HLA-DPB1, HLA-DRA, HLA-DQA1, PLEK
## ANXA2, TXN, HLA-DQB1, LGALS3, HLA-DRB5, CRIP1, CD74, TMSB10, PFN1, CD300A
## TAGLN2, FTL, TYROBP, CLU, GINS2, TBXAS1, IFITM1, LIME1, TIMP1, EFHD2
## PC_ 5
## Positive: LGALS1, S100A4, S100A6, ANXA2, C12orf75, ACTB, S100A10, TXN, HOPX, PFN1
## GAPDH, ACTG1, TMSB10, CFL1, CD74, LAG3, LINC02694, PRDX1, HLA-DPA1, VIM
## PHLDA1, FLNA, HLA-DRB1, JPT1, AHNAK, HLA-DRA, THEMIS, ALOX5AP, CD82, TIMP1
## Negative: CCR7, TCF7, LEF1, TRIM22, SESN3, TYROBP, CXCR4, FAM111B, FCER1G, PTMA
## TXK, RNF130, PLAC8, MCM7, DHRS3, PTGDR, DTL, CYSLTR1, CDC6, CCNE2
## CLSPN, SELL, TK1, MYBL2, SH2D1B, FEN1, LINC00861, FHIT, KCNQ5, NCAM1
print(pbmc.f[["pca"]], dims = 1:5, nfeatures = 5)
## PC_ 1
## Positive: LTB, TSC22D3, IL7R, FP236383.3, TRIM22
## Negative: RRM2, TYMS, MKI67, TUBA1B, STMN1
## PC_ 2
## Positive: CTSW, PRF1, GZMB, CCL4, NKG7
## Negative: IL7R, TIMP1, AQP3, S100A6, LIME1
## PC_ 3
## Positive: PLK1, CLIC3, PLEK, TYROBP, FCER1G
## Negative: GINS2, HSP90AB1, MCM2, MCM6, CSF2
## PC_ 4
## Positive: TRBV6-2, CD27, TRAV12-2, LRRN3, PLPP1
## Negative: HLA-DRB1, HLA-DPA1, S100A4, LGALS1, S100A6
## PC_ 5
## Positive: LGALS1, S100A4, S100A6, ANXA2, C12orf75
## Negative: CCR7, TCF7, LEF1, TRIM22, SESN3
The PCA results can be visualized in different ways.
VizDimLoadings(pbmc.f, dims = 1:2, reduction = "pca")
DimPlot(pbmc.f, reduction = "pca") + NoLegend()
DimHeatmap(pbmc.f, dims = 1, cells = 500, balanced = TRUE)
DimHeatmap(pbmc.f, dims = 1:15, cells = 500, balanced = TRUE)
Cells will be clustered based on PCA. How many PC to use is dependent on many factors. For example, if trying to analyze a rare cell subset, you might want to add more PCs. Usually, the first 10 is good to see dimensionality of the data.
ElbowPlot(pbmc.f)
##We select the top 15 PCs for clustering and tSNE based on PCElbowPlot
pbmc.f <- FindNeighbors(pbmc.f, reduction = "pca", dims = 1:15)
## Computing nearest neighbor graph
## Computing SNN
pbmc.f <- FindClusters(pbmc.f, resolution = 0.5, verbose = FALSE)
head(Idents(pbmc.f), 5)
## AAACCTGAGAAACCAT-1 AAACCTGAGAAGGACA-1 AAACCTGAGAATCTCC-1 AAACCTGAGAATGTGT-1
## 4 1 9 1
## AAACCTGAGACGCACA-1
## 7
## Levels: 0 1 2 3 4 5 6 7 8 9 10 11 12
pbmc.f <- RunUMAP(pbmc.f, dims = 1:15)
## 10:42:02 UMAP embedding parameters a = 0.9922 b = 1.112
## 10:42:02 Read 17726 rows and found 15 numeric columns
## 10:42:02 Using Annoy for neighbor search, n_neighbors = 30
## 10:42:02 Building Annoy index with metric = cosine, n_trees = 50
## 0% 10 20 30 40 50 60 70 80 90 100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 10:42:04 Writing NN index file to temp file /tmp/RtmpRlAiI7/filed09935ef3302
## 10:42:04 Searching Annoy index using 1 thread, search_k = 3000
## 10:42:09 Annoy recall = 100%
## 10:42:10 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
## 10:42:11 Initializing from normalized Laplacian + noise (using RSpectra)
## 10:42:11 Commencing optimization for 200 epochs, with 755922 positive edges
## 10:42:33 Optimization finished
# note that you can set `label = TRUE` or use the LabelClusters function to help label
# individual clusters
DimPlot(pbmc.f, reduction = "umap", label = TRUE)
pbmc.f <- RunTSNE(pbmc.f, reduction = "pca", dims = 1:15)
DimPlot(pbmc.f, reduction = "tsne", label = TRUE)
Lets check our metadata now of the seurat object to see what has been added.
head(pbmc.f@meta.data)
## orig.ident nCount_RNA nFeature_RNA percent.mt
## AAACCTGAGAAACCAT-1 fibril 7898 2915 4.304887
## AAACCTGAGAAGGACA-1 fibril 10204 3359 5.556644
## AAACCTGAGAATCTCC-1 fibril 15411 4590 2.602038
## AAACCTGAGAATGTGT-1 fibril 8500 2962 4.141176
## AAACCTGAGACGCACA-1 fibril 30572 6417 3.185922
## AAACCTGAGCTGAAAT-1 fibril 7030 2652 3.911807
## RNA_snn_res.0.5 seurat_clusters
## AAACCTGAGAAACCAT-1 4 4
## AAACCTGAGAAGGACA-1 1 1
## AAACCTGAGAATCTCC-1 9 9
## AAACCTGAGAATGTGT-1 1 1
## AAACCTGAGACGCACA-1 7 7
## AAACCTGAGCTGAAAT-1 0 0
#We now have seurat clusters and RNA_snn_res.0.5 columns added.
# find markers for every cluster compared to all remaining cells, report only the positive
# ones
markers_f <- FindAllMarkers(pbmc.f, only.pos = TRUE)
## Calculating cluster 0
## Calculating cluster 1
## Calculating cluster 2
## Calculating cluster 3
## Calculating cluster 4
## Calculating cluster 5
## Calculating cluster 6
## Calculating cluster 7
## Calculating cluster 8
## Calculating cluster 9
## Calculating cluster 10
## Calculating cluster 11
## Calculating cluster 12
VlnPlot will display the differential expression across the clusters. For example, I am looking here at CD8A and CD4 expression in the clusters.
VlnPlot(pbmc.f, features = c("CD8A", "CD4"))
The raw counts can also be shown instead by adding some parameters.
VlnPlot(pbmc.f, features = c("CD8A", "CD4"), slot = "counts", log = TRUE)
FeaturePlot(pbmc.f, features = c("MS4A1", "GNLY", "CD3E", "CD14", "FCER1A", "FCGR3A", "LYZ", "PPBP",
"CD8A"))
.libPaths(c("/hpc/packages/minerva-centos7/rpackages/4.2.0/site-library", "/hpc/packages/minerva-centos7/rpackages/bioconductor/3.15", .libPaths()))
library(patchwork)
library(tidyr)
f_demuxlet = read.delim("/sc/arion/projects/ad-omics/ashley/data/aF.best", header = T, stringsAsFactors = F, check.names = F)
head(f_demuxlet)
## BARCODE RD.TOTL RD.PASS RD.UNIQ N.SNP BEST
## 1 AAACCTGAGAAACCAT-1 20230 2729 2289 1191 SNG-GSA8_0_NYUMD0334-01
## 2 AAACCTGAGAAGGACA-1 28220 3441 2915 1485 SNG-GSA8_0_NYUMD0289-01
## 3 AAACCTGAGAATCTCC-1 43192 6406 5064 2716 SNG-GSA8_0_NYUMD0327-01
## 4 AAACCTGAGAATGTGT-1 23164 3096 2623 1317 SNG-GSA8_0_NYUMD0334-01
## 5 AAACCTGAGACGCACA-1 80482 12257 10037 4265 SNG-GSA8_0_NYUMD0334-01
## 6 AAACCTGAGCTGAAAT-1 18804 2299 1930 1096 SNG-GSA8_0_NYUMD0289-01
## SNG.1ST SNG.LLK1 SNG.2ND SNG.LLK2 SNG.LLK0
## 1 GSA8_0_NYUMD0334-01 -436.0108 GSA8_0_NYUMD0289-01 -1144.530 -700.9854
## 2 GSA8_0_NYUMD0289-01 -551.2455 GSA8_0_NYUMD0327-01 -1348.805 -849.1844
## 3 GSA8_0_NYUMD0327-01 -978.9799 GSA8_0_NYUMD0334-01 -2477.158 -1538.9178
## 4 GSA8_0_NYUMD0334-01 -501.0333 GSA8_0_NYUMD0315-01 -1271.823 -786.8994
## 5 GSA8_0_NYUMD0334-01 -1514.3772 GSA8_0_NYUMD0289-01 -3999.847 -2477.8828
## 6 GSA8_0_NYUMD0289-01 -422.6331 GSA8_0_NYUMD0334-01 -1018.341 -640.8195
## DBL.1ST DBL.2ND ALPHA LLK12 LLK1
## 1 GSA8_0_NYUMD0315-01 GSA8_0_NYUMD0334-01 0.5 -650.5111 -1179.6639
## 2 GSA8_0_NYUMD0289-01 GSA8_0_NYUMD0327-01 0.5 -798.6921 -551.2455
## 3 GSA8_0_NYUMD0327-01 GSA8_0_NYUMD0334-01 0.5 -1423.6142 -978.9799
## 4 GSA8_0_NYUMD0327-01 GSA8_0_NYUMD0334-01 0.5 -735.1817 -1273.6766
## 5 GSA8_0_NYUMD0289-01 GSA8_0_NYUMD0334-01 0.5 -2273.7404 -3999.8471
## 6 GSA8_0_NYUMD0289-01 GSA8_0_NYUMD0334-01 0.5 -599.9499 -422.6331
## LLK2 LLK10 LLK20 LLK00 PRB.DBL PRB.SNG1
## 1 -436.0108 -1007.9044 -658.1725 -717.9619 2.33e-94 1
## 2 -1348.8051 -555.0604 -798.6921 -875.2910 1.14e-108 1
## 3 -2477.1579 -1464.5035 -2200.8787 -1583.0830 2.63e-194 1
## 4 -501.0333 -1136.8613 -738.3541 -809.4175 8.21e-103 1
## 5 -1514.3772 -4005.0462 -2273.7404 -2537.6268 0.00e+00 1
## 6 -1018.3409 -424.8764 -599.9499 -659.8594 3.29e-78 1
To edit: I will split the Best column into multiple columns.
f_demuxlet_edit = f_demuxlet %>%
mutate(BEST = gsub("-01","", BEST)) %>%
separate(BEST, into=c("DMX_classification.global","DMX_maxID","DMX_secondID"), sep="-") %>%
separate(DMX_maxID, into=c("DMX_garbage1","DMX_garbage2","DMX_maxID"), sep ="_") %>%
separate(DMX_secondID, into=c("DMX_garbage3","DMX_garbage4","DMX_secondID"), sep ="_") %>%
select(-contains("garbage"))
## Warning: Expected 3 pieces. Additional pieces discarded in 2522 rows [13, 18,
## 20, 25, 27, 29, 35, 48, 63, 66, 69, 71, 79, 82, 100, 102, 103, 107, 116, 124,
## ...].
## Warning: Expected 3 pieces. Missing pieces filled with `NA` in 15896 rows [1,
## 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 15, 16, 17, 19, 21, 22, 23, ...].
head(f_demuxlet_edit)
## BARCODE RD.TOTL RD.PASS RD.UNIQ N.SNP DMX_classification.global
## 1 AAACCTGAGAAACCAT-1 20230 2729 2289 1191 SNG
## 2 AAACCTGAGAAGGACA-1 28220 3441 2915 1485 SNG
## 3 AAACCTGAGAATCTCC-1 43192 6406 5064 2716 SNG
## 4 AAACCTGAGAATGTGT-1 23164 3096 2623 1317 SNG
## 5 AAACCTGAGACGCACA-1 80482 12257 10037 4265 SNG
## 6 AAACCTGAGCTGAAAT-1 18804 2299 1930 1096 SNG
## DMX_maxID DMX_secondID SNG.1ST SNG.LLK1 SNG.2ND
## 1 NYUMD0334 <NA> GSA8_0_NYUMD0334-01 -436.0108 GSA8_0_NYUMD0289-01
## 2 NYUMD0289 <NA> GSA8_0_NYUMD0289-01 -551.2455 GSA8_0_NYUMD0327-01
## 3 NYUMD0327 <NA> GSA8_0_NYUMD0327-01 -978.9799 GSA8_0_NYUMD0334-01
## 4 NYUMD0334 <NA> GSA8_0_NYUMD0334-01 -501.0333 GSA8_0_NYUMD0315-01
## 5 NYUMD0334 <NA> GSA8_0_NYUMD0334-01 -1514.3772 GSA8_0_NYUMD0289-01
## 6 NYUMD0289 <NA> GSA8_0_NYUMD0289-01 -422.6331 GSA8_0_NYUMD0334-01
## SNG.LLK2 SNG.LLK0 DBL.1ST DBL.2ND ALPHA LLK12
## 1 -1144.530 -700.9854 GSA8_0_NYUMD0315-01 GSA8_0_NYUMD0334-01 0.5 -650.5111
## 2 -1348.805 -849.1844 GSA8_0_NYUMD0289-01 GSA8_0_NYUMD0327-01 0.5 -798.6921
## 3 -2477.158 -1538.9178 GSA8_0_NYUMD0327-01 GSA8_0_NYUMD0334-01 0.5 -1423.6142
## 4 -1271.823 -786.8994 GSA8_0_NYUMD0327-01 GSA8_0_NYUMD0334-01 0.5 -735.1817
## 5 -3999.847 -2477.8828 GSA8_0_NYUMD0289-01 GSA8_0_NYUMD0334-01 0.5 -2273.7404
## 6 -1018.341 -640.8195 GSA8_0_NYUMD0289-01 GSA8_0_NYUMD0334-01 0.5 -599.9499
## LLK1 LLK2 LLK10 LLK20 LLK00 PRB.DBL PRB.SNG1
## 1 -1179.6639 -436.0108 -1007.9044 -658.1725 -717.9619 2.33e-94 1
## 2 -551.2455 -1348.8051 -555.0604 -798.6921 -875.2910 1.14e-108 1
## 3 -978.9799 -2477.1579 -1464.5035 -2200.8787 -1583.0830 2.63e-194 1
## 4 -1273.6766 -501.0333 -1136.8613 -738.3541 -809.4175 8.21e-103 1
## 5 -3999.8471 -1514.3772 -4005.0462 -2273.7404 -2537.6268 0.00e+00 1
## 6 -422.6331 -1018.3409 -424.8764 -599.9499 -659.8594 3.29e-78 1
table(f_demuxlet_edit$DMX_classification.global) #num of singlets and doublets
##
## AMB DBL SNG
## 17 2505 15896
table(f_demuxlet_edit$DMX_maxID) #number of cells identified as each donor
##
## NYUMD0289 NYUMD0315 NYUMD0327 NYUMD0334
## 6750 3559 3609 4500
table(f_demuxlet_edit[,c("DMX_classification.global","DMX_maxID")]) #number of singlets or doublets identified as each donor
## DMX_maxID
## DMX_classification.global NYUMD0289 NYUMD0315 NYUMD0327 NYUMD0334
## AMB 5 4 4 4
## DBL 1481 610 399 15
## SNG 5264 2945 3206 4481
f_demuxlet_edit.subset <- f_demuxlet_edit[f_demuxlet_edit$BARCODE %in% colnames(pbmc.f),]
pbmc.f@meta.data <- cbind(pbmc.f@meta.data,f_demuxlet_edit.subset$DMX_maxID,f_demuxlet_edit.subset$DMX_classification.global, f_demuxlet_edit.subset$BARCODE)
head(pbmc.f@meta.data)
## orig.ident nCount_RNA nFeature_RNA percent.mt
## AAACCTGAGAAACCAT-1 fibril 7898 2915 4.304887
## AAACCTGAGAAGGACA-1 fibril 10204 3359 5.556644
## AAACCTGAGAATCTCC-1 fibril 15411 4590 2.602038
## AAACCTGAGAATGTGT-1 fibril 8500 2962 4.141176
## AAACCTGAGACGCACA-1 fibril 30572 6417 3.185922
## AAACCTGAGCTGAAAT-1 fibril 7030 2652 3.911807
## RNA_snn_res.0.5 seurat_clusters
## AAACCTGAGAAACCAT-1 4 4
## AAACCTGAGAAGGACA-1 1 1
## AAACCTGAGAATCTCC-1 9 9
## AAACCTGAGAATGTGT-1 1 1
## AAACCTGAGACGCACA-1 7 7
## AAACCTGAGCTGAAAT-1 0 0
## f_demuxlet_edit.subset$DMX_maxID
## AAACCTGAGAAACCAT-1 NYUMD0334
## AAACCTGAGAAGGACA-1 NYUMD0289
## AAACCTGAGAATCTCC-1 NYUMD0327
## AAACCTGAGAATGTGT-1 NYUMD0334
## AAACCTGAGACGCACA-1 NYUMD0334
## AAACCTGAGCTGAAAT-1 NYUMD0289
## f_demuxlet_edit.subset$DMX_classification.global
## AAACCTGAGAAACCAT-1 SNG
## AAACCTGAGAAGGACA-1 SNG
## AAACCTGAGAATCTCC-1 SNG
## AAACCTGAGAATGTGT-1 SNG
## AAACCTGAGACGCACA-1 SNG
## AAACCTGAGCTGAAAT-1 SNG
## f_demuxlet_edit.subset$BARCODE
## AAACCTGAGAAACCAT-1 AAACCTGAGAAACCAT-1
## AAACCTGAGAAGGACA-1 AAACCTGAGAAGGACA-1
## AAACCTGAGAATCTCC-1 AAACCTGAGAATCTCC-1
## AAACCTGAGAATGTGT-1 AAACCTGAGAATGTGT-1
## AAACCTGAGACGCACA-1 AAACCTGAGACGCACA-1
## AAACCTGAGCTGAAAT-1 AAACCTGAGCTGAAAT-1
The Seurat Object has already been preprocessed in my case, so this should be clean)
pbmc.f[["percent.mt"]] <- PercentageFeatureSet(pbmc.f, pattern = "^MT-")
library(cowplot)
##
## Attaching package: 'cowplot'
## The following object is masked from 'package:patchwork':
##
## align_plots
# look at distribution of metrics by classification
plot_grid(VlnPlot(pbmc.f, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3, group.by = "f_demuxlet_edit.subset$DMX_maxID"))
VlnPlot(pbmc.f, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3, group.by = "f_demuxlet_edit.subset$DMX_classification.global")
Now, select for MT content under 10% and for nfeatureRNA > 200 to ensure getting alive cells. I previously did this, so should see no change in cell numnbers.
pbmc.f <- subset(pbmc.f, subset = nFeature_RNA > 200 & percent.mt < 10)
length(colnames(pbmc.f))
## [1] 17726
length(rownames(pbmc.f))
## [1] 23791
Add column to meta data to identify seurat object as Basline condition. Also rename some columns for clarity purposes.
pbmc.f@meta.data$condition <- 'Fibril'
names(pbmc.f@meta.data)[names(pbmc.f@meta.data) == "f_demuxlet_edit.subset$DMX_classification.global"] <- "DMX_classification.global"
names(pbmc.f@meta.data)[names(pbmc.f@meta.data) == "f_demuxlet_edit.subset$DMX_maxID"] <- "DMX_maxID"
names(pbmc.f@meta.data)[names(pbmc.f@meta.data) == "f_demuxlet_edit.subset$BARCODE"] <- "Barcode"
head(pbmc.f@meta.data)
## orig.ident nCount_RNA nFeature_RNA percent.mt
## AAACCTGAGAAACCAT-1 fibril 7898 2915 4.304887
## AAACCTGAGAAGGACA-1 fibril 10204 3359 5.556644
## AAACCTGAGAATCTCC-1 fibril 15411 4590 2.602038
## AAACCTGAGAATGTGT-1 fibril 8500 2962 4.141176
## AAACCTGAGACGCACA-1 fibril 30572 6417 3.185922
## AAACCTGAGCTGAAAT-1 fibril 7030 2652 3.911807
## RNA_snn_res.0.5 seurat_clusters DMX_maxID
## AAACCTGAGAAACCAT-1 4 4 NYUMD0334
## AAACCTGAGAAGGACA-1 1 1 NYUMD0289
## AAACCTGAGAATCTCC-1 9 9 NYUMD0327
## AAACCTGAGAATGTGT-1 1 1 NYUMD0334
## AAACCTGAGACGCACA-1 7 7 NYUMD0334
## AAACCTGAGCTGAAAT-1 0 0 NYUMD0289
## DMX_classification.global Barcode condition
## AAACCTGAGAAACCAT-1 SNG AAACCTGAGAAACCAT-1 Fibril
## AAACCTGAGAAGGACA-1 SNG AAACCTGAGAAGGACA-1 Fibril
## AAACCTGAGAATCTCC-1 SNG AAACCTGAGAATCTCC-1 Fibril
## AAACCTGAGAATGTGT-1 SNG AAACCTGAGAATGTGT-1 Fibril
## AAACCTGAGACGCACA-1 SNG AAACCTGAGACGCACA-1 Fibril
## AAACCTGAGCTGAAAT-1 SNG AAACCTGAGCTGAAAT-1 Fibril
saveRDS(pbmc.f, file = "/sc/arion/projects/ad-omics/ashley/PD_Stim/pbmc.fibril.final.RDS")